# Read the SDR 2023 CSV from the project's data folder and pass it through
# clean_names(); later code refers to columns such as `goal_7_score`.
sdr_data <- here("data/SDR-2023-Data.csv") %>%
  read.csv() %>%
  clean_names()
# Manual fill colours; the names are matched against the values of the
# `goal_7_dash` column by scale_fill_manual() below.
unique_colors <- c(
  "green" = "darkseagreen",
  "orange" = "darkorange1",
  "red" = "coral2",
  "yellow" = "darkgoldenrod2"
)

# Horizontal bar chart of the SDG 7 score per country, one facet per SDR
# region, with countries ordered by score and bars filled by `goal_7_dash`.
goal_7_faceted_bar_plot <- ggplot(
  data = sdr_data,
  aes(
    x = goal_7_score,
    y = reorder(country, goal_7_score),
    fill = goal_7_dash
  )
) +
  # geom_col() has no `stat` parameter (it always plots the values as given);
  # passing `stat = "identity"` only produced an "unknown parameters" warning.
  geom_col() +
  # Free y scales so each facet lists only its own countries.
  facet_wrap(~regions_used_for_the_sdr, scales = "free_y") +
  scale_fill_manual(values = unique_colors) + # Specify manual fill scale
  theme_minimal() +
  # Small labels because each facet shows many country names.
  theme(axis.text.y = element_text(size = 4)) +
  labs(
    x = "SDG 7 Score",
    y = ""
  )

# Interactive version of the same chart.
ggplotly(goal_7_faceted_bar_plot)
## Warning: Removed 27 rows containing missing values (`position_stack()`).
# Scatter plot of the SDG 1 score (y) against the SDG 7 score (x), with a
# default geom_smooth() trend line and a stat_cor() annotation on top.
sdr_data %>%
  ggplot(mapping = aes(x = goal_7_score, y = goal_1_score)) +
  geom_point() +
  geom_smooth() +
  stat_cor() +
  theme_minimal()
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## Warning: Removed 42 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 42 rows containing non-finite values (`stat_cor()`).
## Warning: Removed 42 rows containing missing values (`geom_point()`).
# Scatter plot of SDG 1 score against SDG 7 score, coloured by SDR region.
# `label = country` is mapped so the country name travels with each point
# into the interactive plot.
# NOTE(review): the object name mentions goals 7 and 9, but the y aesthetic
# is `goal_1_score` -- confirm which goal was intended.
goal_7_and_9_scatter_plot <- sdr_data %>%
  ggplot(
    aes(
      x = goal_7_score,
      y = goal_1_score,
      color = regions_used_for_the_sdr,
      label = country
    )
  ) +
  geom_point() +
  scale_color_brewer(palette = "Set3") +
  theme_minimal()

# Render the scatter plot as an interactive widget.
ggplotly(goal_7_and_9_scatter_plot)
# Medium-scale country polygons as an sf object, reduced to the long name,
# the ISO3 code and the geometry column.
world <- ne_countries(scale = "medium", returnclass = "sf") %>%
  select(name_long, iso_a3, geometry)

# Give the SDR ISO3 column the same name as the key in `world` so the two
# tables can be joined on it.
names(sdr_data)[names(sdr_data) == "country_code_iso3"] <- "iso_a3"

# Keep every SDR row and attach the matching polygon where one exists.
joined_df <- sdr_data %>%
  left_join(world, by = "iso_a3")

# Convert the joined table to sf and express it in WGS84 longitude/latitude.
world_df_joined <- joined_df %>%
  st_as_sf() %>%
  st_transform("+proj=longlat +datum=WGS84")
# One HTML hover label per row: country name plus the SDG 7 score rounded
# to two decimals.
mytext <- lapply(
  paste0(
    "Country: ", world_df_joined$country, "<br/>",
    "Goal 7 Score: ", round(world_df_joined$goal_7_score, 2)
  ),
  htmltools::HTML
)

# Interactive map with polygons coloured by the quantile of the SDG 7 score
# on the "YlOrRd" palette.
leaflet(world_df_joined) %>%
  addTiles() %>%
  setView(lng = 0, lat = 10, zoom = 2) %>%
  addPolygons(
    stroke = FALSE,
    fillOpacity = 0.5,
    smoothFactor = 0.5,
    color = ~ colorQuantile("YlOrRd", goal_7_score)(goal_7_score),
    label = mytext
  )
Correlation matrix
# Keep the 17 goal score columns (goal_1_score ... goal_17_score), in order.
sdr_scores <- select(sdr_data, all_of(paste0("goal_", 1:17, "_score")))

# cor() is given a matrix; "complete.obs" restricts the computation to rows
# with no missing value in any of the 17 columns.
sdr_scores_matrix <- as.matrix(sdr_scores)

# NOTE(review): the result is stored under the name `cor`, the same name as
# the function. Calls to cor() still resolve to the function, but a distinct
# name would be clearer.
cor <- cor(sdr_scores_matrix, use = "complete.obs")

# Lower-triangle correlation plot drawn with circles and printed coefficients.
ggcorrplot::ggcorrplot(
  cor,
  method = "circle",
  type = "lower",
  lab = TRUE
)
Goal 7 tile heatmap
# Country, region, the four normalized SDG 7 indicator scores, and the
# goal 7 dash and trend columns.
goal_seven_data <- select(
  sdr_data,
  country,
  regions_used_for_the_sdr,
  normalized_score_sdg7_elecac,
  normalized_score_sdg7_cleanfuel,
  normalized_score_sdg7_co2twh,
  normalized_score_sdg7_renewcon,
  goal_7_dash,
  goal_7_trend
)

# Long format: one row per country and indicator, with the indicator name
# in `variable` and its score in `value`.
melted_data <- goal_seven_data %>%
  pivot_longer(
    cols = starts_with("normalized_score_sdg7"),
    names_to = "variable",
    values_to = "value"
  )

# Heatmap of indicator scores per country, one facet per SDR region.
melted_data %>%
  ggplot(aes(x = variable, y = country, fill = value)) +
  geom_tile(color = "white") +
  facet_wrap(~regions_used_for_the_sdr, scales = "free_y") +
  scale_fill_viridis_c() +
  labs(x = "", y = "", fill = "Score") +
  theme_minimal() +
  # Rotated, small axis text so the long indicator and country names fit.
  theme(
    axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1, size = 5),
    axis.text.y = element_text(size = 5)
  )
TODO: also show the goal 7 dash and trend on the x axis.
# Histogram of SDG 7 scores with bars stacked and filled by SDR region;
# the bin count is left at the geom_histogram() default.
sdr_data %>%
  ggplot(aes(x = goal_7_score, fill = regions_used_for_the_sdr)) +
  geom_histogram(color = "black") +
  scale_fill_viridis_d() +
  theme_minimal()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 27 rows containing non-finite values (`stat_bin()`).
Missing
# Missing values per goal score column.
gg_miss_var(sdr_scores)

# SDG 1 vs SDG 7 scatter drawn with geom_miss_point() instead of
# geom_point(), coloured by SDR region.
sdr_data %>%
  ggplot(
    aes(
      x = goal_7_score,
      y = goal_1_score,
      color = regions_used_for_the_sdr,
      label = country
    )
  ) +
  geom_miss_point() +
  scale_color_brewer(palette = "Set3") +
  theme_minimal()
Cleaning and Imputation for ML - Clustering and Random Forest Regression/Classification
# Country name plus every column whose name contains "normalized_score".
sdr_data_normalized_scores <- sdr_data %>%
  select(country, contains("normalized_score"))

# Share of missing values per column.
gg_miss_var(sdr_data_normalized_scores, show_pct = TRUE)

# Drop columns with more than 30% missing values before imputing.
sdr_data_normalized_scores_less_na <- sdr_data_normalized_scores %>%
  select(where(~ sum(is.na(.x)) / length(.x) <= 0.3))

# Impute the remaining gaps with missRanger(). A fixed seed makes the
# imputed values, and every downstream result, reproducible across runs.
# NOTE(review): `country` is still included as a predictor here; as a
# per-row identifier it probably carries no signal -- consider
# `formula = . ~ . - country`.
sdr_data_imputed <- missRanger(
  sdr_data_normalized_scores_less_na,
  seed = 123
)
##
## Missing value imputation by random forests
##
## Variables to impute: normalized_score_sdg1_wpc, normalized_score_sdg1_lmicpov, normalized_score_sdg2_undernsh, normalized_score_sdg2_stunting, normalized_score_sdg2_wasting, normalized_score_sdg2_obesity, normalized_score_sdg2_trophic, normalized_score_sdg2_crlyld, normalized_score_sdg2_snmi, normalized_score_sdg3_matmort, normalized_score_sdg3_neonat, normalized_score_sdg3_u5mort, normalized_score_sdg3_tb, normalized_score_sdg3_ncds, normalized_score_sdg3_pollmort, normalized_score_sdg3_traffic, normalized_score_sdg3_lifee, normalized_score_sdg3_fertility, normalized_score_sdg3_births, normalized_score_sdg3_vac, normalized_score_sdg3_uhc, normalized_score_sdg3_swb, normalized_score_sdg4_earlyedu, normalized_score_sdg4_primary, normalized_score_sdg4_second, normalized_score_sdg4_literacy, normalized_score_sdg5_familypl, normalized_score_sdg5_edat, normalized_score_sdg5_lfpr, normalized_score_sdg5_parl, normalized_score_sdg6_water, normalized_score_sdg6_sanita, normalized_score_sdg6_freshwat, normalized_score_sdg6_wastewat, normalized_score_sdg6_scarcew, normalized_score_sdg7_elecac, normalized_score_sdg7_cleanfuel, normalized_score_sdg7_co2twh, normalized_score_sdg7_renewcon, normalized_score_sdg8_adjgrowth, normalized_score_sdg8_slavery, normalized_score_sdg8_accounts, normalized_score_sdg8_unemp, normalized_score_sdg8_impacc, normalized_score_sdg8_impslav, normalized_score_sdg9_roads, normalized_score_sdg9_intuse, normalized_score_sdg9_mobuse, normalized_score_sdg9_lpi, normalized_score_sdg9_uni, normalized_score_sdg9_articles, normalized_score_sdg9_rdex, normalized_score_sdg10_gini, normalized_score_sdg10_palma, normalized_score_sdg11_slums, normalized_score_sdg11_pm25, normalized_score_sdg11_pipedwat, normalized_score_sdg11_transport, normalized_score_sdg12_msw, normalized_score_sdg12_ewaste, normalized_score_sdg12_so2prod, normalized_score_sdg12_so2import, normalized_score_sdg12_nprod, normalized_score_sdg12_nimport, normalized_score_sdg12_explastic, 
normalized_score_sdg13_co2gcp, normalized_score_sdg13_co2import, normalized_score_sdg13_co2export, normalized_score_sdg14_biomar, normalized_score_sdg15_cpta, normalized_score_sdg15_cpfa, normalized_score_sdg15_redlist, normalized_score_sdg15_forchg, normalized_score_sdg15_biofrwter, normalized_score_sdg16_homicides, normalized_score_sdg16_detain, normalized_score_sdg16_safe, normalized_score_sdg16_u5reg, normalized_score_sdg16_cpi, normalized_score_sdg16_weaponsexp, normalized_score_sdg16_rsf, normalized_score_sdg17_govex, normalized_score_sdg17_cohaven, normalized_score_sdg17_statperf
## Variables used to impute: country, normalized_score_sdg1_wpc, normalized_score_sdg1_lmicpov, normalized_score_sdg2_undernsh, normalized_score_sdg2_stunting, normalized_score_sdg2_wasting, normalized_score_sdg2_obesity, normalized_score_sdg2_trophic, normalized_score_sdg2_crlyld, normalized_score_sdg2_snmi, normalized_score_sdg3_matmort, normalized_score_sdg3_neonat, normalized_score_sdg3_u5mort, normalized_score_sdg3_tb, normalized_score_sdg3_ncds, normalized_score_sdg3_pollmort, normalized_score_sdg3_traffic, normalized_score_sdg3_lifee, normalized_score_sdg3_fertility, normalized_score_sdg3_births, normalized_score_sdg3_vac, normalized_score_sdg3_uhc, normalized_score_sdg3_swb, normalized_score_sdg4_earlyedu, normalized_score_sdg4_primary, normalized_score_sdg4_second, normalized_score_sdg4_literacy, normalized_score_sdg5_familypl, normalized_score_sdg5_edat, normalized_score_sdg5_lfpr, normalized_score_sdg5_parl, normalized_score_sdg6_water, normalized_score_sdg6_sanita, normalized_score_sdg6_freshwat, normalized_score_sdg6_wastewat, normalized_score_sdg6_scarcew, normalized_score_sdg7_elecac, normalized_score_sdg7_cleanfuel, normalized_score_sdg7_co2twh, normalized_score_sdg7_renewcon, normalized_score_sdg8_adjgrowth, normalized_score_sdg8_slavery, normalized_score_sdg8_accounts, normalized_score_sdg8_unemp, normalized_score_sdg8_impacc, normalized_score_sdg8_impslav, normalized_score_sdg9_roads, normalized_score_sdg9_intuse, normalized_score_sdg9_mobuse, normalized_score_sdg9_lpi, normalized_score_sdg9_uni, normalized_score_sdg9_articles, normalized_score_sdg9_rdex, normalized_score_sdg10_gini, normalized_score_sdg10_palma, normalized_score_sdg11_slums, normalized_score_sdg11_pm25, normalized_score_sdg11_pipedwat, normalized_score_sdg11_transport, normalized_score_sdg12_msw, normalized_score_sdg12_ewaste, normalized_score_sdg12_so2prod, normalized_score_sdg12_so2import, normalized_score_sdg12_nprod, normalized_score_sdg12_nimport, 
normalized_score_sdg12_explastic, normalized_score_sdg13_co2gcp, normalized_score_sdg13_co2import, normalized_score_sdg13_co2export, normalized_score_sdg14_biomar, normalized_score_sdg15_cpta, normalized_score_sdg15_cpfa, normalized_score_sdg15_redlist, normalized_score_sdg15_forchg, normalized_score_sdg15_biofrwter, normalized_score_sdg16_homicides, normalized_score_sdg16_detain, normalized_score_sdg16_safe, normalized_score_sdg16_u5reg, normalized_score_sdg16_cpi, normalized_score_sdg16_weaponsexp, normalized_score_sdg16_rsf, normalized_score_sdg17_govex, normalized_score_sdg17_cohaven, normalized_score_sdg17_statperf
## iter 1: ....................................................................................
## iter 2: ....................................................................................
## iter 3: ....................................................................................
## iter 4: ....................................................................................
## iter 5: ....................................................................................
Cluster
# Move the country names into the row names so that only the score columns
# remain as clustering features.
sdr_data_imputed <- sdr_data_imputed %>%
  remove_rownames() %>%
  column_to_rownames(var = "country")

# k-means starts from random centres; fix the seed so the silhouette
# diagnostic and the final clustering are reproducible.
set.seed(123)

# Silhouette diagnostic for choosing the number of clusters.
fviz_nbclust(sdr_data_imputed, kmeans, method = "silhouette")

# Two-cluster solution. Several random starts (`nstart`) reduce the risk of
# keeping a poor local optimum from a single initialisation.
k2 <- kmeans(sdr_data_imputed, centers = 2, nstart = 25)

# Plot the cluster assignment.
fviz_cluster(k2, data = sdr_data_imputed) +
  theme_minimal()
Find key drivers of clustering (HI Appleseed Analysis)
Random Forest
# Random forests are fitted with random resampling; fix the seed so the fit
# and the variable-importance ranking are reproducible.
set.seed(123)

# Regress the normalized maternal-mortality score on all other columns and
# keep the importance measures for the plot that follows.
rf_matmort <- randomForest(
  normalized_score_sdg3_matmort ~ .,
  data = sdr_data_imputed,
  importance = TRUE
)

# Print the model summary.
rf_matmort
##
## Call:
## randomForest(formula = normalized_score_sdg3_matmort ~ ., data = sdr_data_imputed, importance = TRUE)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 27
##
## Mean of squared residuals: 85.90416
## % Var explained: 83.28
# Importance matrix of the fitted forest as a data frame (predictor names
# are the row names).
importance_df <- as.data.frame(rf_matmort$importance)

# Ten predictors with the largest `%IncMSE`, with the predictor name moved
# into a `variable` column.
importance_df_top_10 <- slice_max(
  rownames_to_column(importance_df, var = "variable"),
  order_by = `%IncMSE`,
  n = 10
)

# Horizontal bar chart of those ten predictors, ordered by `%IncMSE`.
importance_df_top_10 %>%
  ggplot(aes(x = `%IncMSE`, y = reorder(variable, `%IncMSE`))) +
  geom_col(fill = "steelblue", color = "black") +
  theme_minimal()
Partial Dependence Plots
Week 2 Day 1 - EDA: getting to know your data, troubleshooting. Week 2 Day 2 - Bar chart, basic viz, maps, histogram, bubble plots.
Week 3 Day 1 - Scatterplot with line, correlation matrices. Week 3 Day 2 - Machine learning, imputing data, clustering, random forest.